#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Ignore Warnings
import warnings
warnings.filterwarnings("ignore")
# Load the dataset from the local CSV export
df = pd.read_csv("C:\\Users\\vaish\\Downloads\\archive (1)\\cybersecurity_attacks.csv")

# First look at the data: leading rows (transposed) and the column labels
df.head().T
df.columns
n_rows, n_cols = df.shape
print(f"There are {n_rows}, row and {n_cols} columns in the dataset")
df.info()

# Number of missing values per column, largest first
missing_per_column = df.isnull().sum()
missing_per_column.sort_values(ascending=False)

# The same counts expressed as a percentage of all rows
missing_per_column / len(df) * 100
# Output:
# There are 40000, row and 25 columns in the dataset
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 40000 entries, 0 to 39999
# Data columns (total 25 columns):
#  #   Column                  Non-Null Count  Dtype
# ---  ------                  --------------  -----
#  0   Timestamp               40000 non-null  object
#  1   Source IP Address       40000 non-null  object
#  2   Destination IP Address  40000 non-null  object
#  3   Source Port             40000 non-null  int64
#  4   Destination Port        40000 non-null  int64
#  5   Protocol                40000 non-null  object
#  6   Packet Length           40000 non-null  int64
#  7   Packet Type             40000 non-null  object
#  8   Traffic Type            40000 non-null  object
#  9   Payload Data            40000 non-null  object
#  10  Malware Indicators      20000 non-null  object
#  11  Anomaly Scores          40000 non-null  float64
#  12  Alerts/Warnings         19933 non-null  object
#  13  Attack Type             40000 non-null  object
#  14  Attack Signature        40000 non-null  object
#  15  Action Taken            40000 non-null  object
#  16  Severity Level          40000 non-null  object
#  17  User Information        40000 non-null  object
#  18  Device Information      40000 non-null  object
#  19  Network Segment         40000 non-null  object
#  20  Geo-location Data       40000 non-null  object
#  21  Proxy Information       20149 non-null  object
#  22  Firewall Logs           20039 non-null  object
#  23  IDS/IPS Alerts          19950 non-null  object
#  24  Log Source              40000 non-null  object
# dtypes: float64(1), int64(3), object(21)
# memory usage: 7.6+ MB
#
# Output (missing values per column, in percent):
# Timestamp                  0.0000
# Source IP Address          0.0000
# Destination IP Address     0.0000
# Source Port                0.0000
# Destination Port           0.0000
# Protocol                   0.0000
# Packet Length              0.0000
# Packet Type                0.0000
# Traffic Type               0.0000
# Payload Data               0.0000
# Malware Indicators        50.0000
# Anomaly Scores             0.0000
# Alerts/Warnings           50.1675
# Attack Type                0.0000
# Attack Signature           0.0000
# Action Taken               0.0000
# Severity Level             0.0000
# User Information           0.0000
# Device Information         0.0000
# Network Segment            0.0000
# Geo-location Data          0.0000
# Proxy Information         49.6275
# Firewall Logs             49.9025
# IDS/IPS Alerts            50.1250
# Log Source                 0.0000
# dtype: float64
# Handle the missing values.
# Five columns contain NaN; each is replaced by an explicit category so that
# no nulls remain in the frame.

# 'Alerts/Warnings' becomes a yes/no flag: 'yes' only when the value is exactly
# 'Alert Triggered'; every other value, including NaN, becomes 'no'.
df['Alerts/Warnings'] = np.where(df['Alerts/Warnings'].eq('Alert Triggered'), 'yes', 'no')

# A missing 'Malware Indicators' value is recorded as 'No Detection';
# existing values are kept unchanged.
df['Malware Indicators'] = df['Malware Indicators'].fillna('No Detection')

# A missing 'Proxy Information' value is assumed to mean no proxy was used.
df['Proxy Information'] = df['Proxy Information'].fillna('No proxy')

# A missing 'Firewall Logs' value is recorded as 'No Data'.
df['Firewall Logs'] = df['Firewall Logs'].fillna('No Data')

# A missing 'IDS/IPS Alerts' value is recorded as 'No Data'.
df['IDS/IPS Alerts'] = df['IDS/IPS Alerts'].fillna('No Data')

# Confirm that no missing values remain
df.isnull().sum().sort_values(ascending=False)
# Output:
# Timestamp                 0
# Attack Type               0
# IDS/IPS Alerts            0
# Firewall Logs             0
# Proxy Information         0
# Geo-location Data         0
# Network Segment           0
# Device Information        0
# User Information          0
# Severity Level            0
# Action Taken              0
# Attack Signature          0
# Alerts/Warnings           0
# Source IP Address         0
# Anomaly Scores            0
# Malware Indicators        0
# Payload Data              0
# Traffic Type              0
# Packet Type               0
# Packet Length             0
# Protocol                  0
# Destination Port          0
# Source Port               0
# Destination IP Address    0
# Log Source                0
# dtype: int64
# Explore the Device Information column.
# The text in front of the first '/' of each entry is stored as 'Browser'.
device_info_parts = df['Device Information'].str.split('/')
df['Browser'] = device_info_parts.str[0]
df['Browser']
import re
# OS and device names to search for, in priority order.
# Each entry is used both as the (case-insensitive) search pattern and as the
# label returned for a match.  'Android' is listed before 'Linux' because
# Android user agents usually also contain the token "Linux"
# (e.g. "Linux; Android 4.4.2") and would otherwise be labelled as Linux.
patterns = [
    r'Windows',
    r'Android',
    r'Linux',
    r'iPad',
    r'iPod',
    r'iPhone',
    r'Macintosh',
]


def extract_device_or_os(user_agent):
    """Return the first device/OS name from ``patterns`` found in a user agent.

    Args:
        user_agent: User-agent text to inspect.  Values that are not strings
            (for example NaN or None for a missing entry) are accepted.

    Returns:
        The matching entry of ``patterns``, spelled exactly as it is listed
        there regardless of the case used in ``user_agent``, or ``'Unknown'``
        when nothing matches or the input is not a string.
    """
    # Missing values reach this function as float NaN / None; re.search would
    # raise TypeError on them, so they are reported as 'Unknown' instead.
    if not isinstance(user_agent, str):
        return 'Unknown'
    for pattern in patterns:
        # re.I makes the search case-insensitive
        if re.search(pattern, user_agent, re.I):
            # Return the canonical name rather than the matched text so that
            # 'windows' and 'Windows' end up in the same category.
            return pattern
    return 'Unknown'
# Label every row with the device or OS found in its 'Device Information' text
df['Device/OS'] = df['Device Information'].map(extract_device_or_os)
df['Browser'].value_counts()
# 'Browser' and 'Device/OS' now carry the extracted values; drop the raw column
df = df.drop(columns='Device Information')
def extract_time_features(df, Timestamp):
    """Add calendar and clock feature columns derived from a timestamp column.

    The DataFrame is modified in place: the timestamp column is converted to
    datetime (when it is not one already) and the columns 'Year', 'Month',
    'Day', 'Hour', 'Minute', 'Second' and 'DayOfWeek' are added or overwritten.

    Args:
        df: DataFrame that contains the timestamp column.
        Timestamp: Label of the column holding the timestamps.

    Returns:
        The same DataFrame object that was passed in (not a copy).
    """
    # Parse only when needed, so calling the function again on an already
    # processed frame does not re-parse the whole column.
    if not pd.api.types.is_datetime64_any_dtype(df[Timestamp]):
        df[Timestamp] = pd.to_datetime(df[Timestamp])

    # Look up the datetime accessor once and reuse it for every feature.
    stamps = df[Timestamp].dt
    df['Year'] = stamps.year
    df['Month'] = stamps.month
    df['Day'] = stamps.day
    df['Hour'] = stamps.hour
    df['Minute'] = stamps.minute
    df['Second'] = stamps.second
    df['DayOfWeek'] = stamps.dayofweek
    return df
# Add the time-feature columns to df.
# extract_time_features modifies df in place and returns that same object, so
# new_df is just another name for df (not an independent copy).  One call is
# enough; repeating it only recomputes the same columns.
new_df = extract_time_features(df, 'Timestamp')
# Check that the new columns were created
print(new_df.head())
df.head().T
df.describe(include = 'object').T
df.columns
Timestamp Source IP Address Destination IP Address Source Port \
0 2023-05-30 06:33:58 103.216.15.12 84.9.164.252 31225
1 2020-08-26 07:08:30 78.199.217.198 66.191.137.154 17245
2 2022-11-13 08:23:25 63.79.210.48 198.219.82.17 16811
3 2023-07-02 10:38:46 163.42.196.10 101.228.192.255 20018
4 2023-07-16 13:11:07 71.166.185.76 189.243.174.238 6131
Destination Port Protocol Packet Length Packet Type Traffic Type \
0 17616 ICMP 503 Data HTTP
1 48166 ICMP 1174 Data HTTP
2 53600 UDP 306 Control HTTP
3 32534 UDP 385 Data HTTP
4 26646 TCP 1462 Data DNS
Payload Data ... Log Source Browser \
0 Qui natus odio asperiores nam. Optio nobis ius... ... Server Mozilla
1 Aperiam quos modi officiis veritatis rem. Omni... ... Firewall Mozilla
2 Perferendis sapiente vitae soluta. Hic delectu... ... Firewall Mozilla
3 Totam maxime beatae expedita explicabo porro l... ... Firewall Mozilla
4 Odit nesciunt dolorem nisi iste iusto. Animi v... ... Firewall Mozilla
Device/OS Year Month Day Hour Minute Second DayOfWeek
0 Windows 2023 5 30 6 33 58 1
1 Windows 2020 8 26 7 8 30 2
2 Windows 2022 11 13 8 23 25 6
3 Macintosh 2023 7 2 10 38 46 6
4 Windows 2023 7 16 13 11 7 6
[5 rows x 33 columns]
Timestamp Source IP Address Destination IP Address Source Port \
0 2023-05-30 06:33:58 103.216.15.12 84.9.164.252 31225
1 2020-08-26 07:08:30 78.199.217.198 66.191.137.154 17245
2 2022-11-13 08:23:25 63.79.210.48 198.219.82.17 16811
3 2023-07-02 10:38:46 163.42.196.10 101.228.192.255 20018
4 2023-07-16 13:11:07 71.166.185.76 189.243.174.238 6131
Destination Port Protocol Packet Length Packet Type Traffic Type \
0 17616 ICMP 503 Data HTTP
1 48166 ICMP 1174 Data HTTP
2 53600 UDP 306 Control HTTP
3 32534 UDP 385 Data HTTP
4 26646 TCP 1462 Data DNS
Payload Data ... Log Source Browser \
0 Qui natus odio asperiores nam. Optio nobis ius... ... Server Mozilla
1 Aperiam quos modi officiis veritatis rem. Omni... ... Firewall Mozilla
2 Perferendis sapiente vitae soluta. Hic delectu... ... Firewall Mozilla
3 Totam maxime beatae expedita explicabo porro l... ... Firewall Mozilla
4 Odit nesciunt dolorem nisi iste iusto. Animi v... ... Firewall Mozilla
Device/OS Year Month Day Hour Minute Second DayOfWeek
0 Windows 2023 5 30 6 33 58 1
1 Windows 2020 8 26 7 8 30 2
2 Windows 2022 11 13 8 23 25 6
3 Macintosh 2023 7 2 10 38 46 6
4 Windows 2023 7 16 13 11 7 6
[5 rows x 33 columns]
Index(['Timestamp', 'Source IP Address', 'Destination IP Address',
'Source Port', 'Destination Port', 'Protocol', 'Packet Length',
'Packet Type', 'Traffic Type', 'Payload Data', 'Malware Indicators',
'Anomaly Scores', 'Alerts/Warnings', 'Attack Type', 'Attack Signature',
'Action Taken', 'Severity Level', 'User Information', 'Network Segment',
'Geo-location Data', 'Proxy Information', 'Firewall Logs',
'IDS/IPS Alerts', 'Log Source', 'Browser', 'Device/OS', 'Year', 'Month',
'Day', 'Hour', 'Minute', 'Second', 'DayOfWeek'],
dtype='object')
# Time-based Plotly charts.
# Each figure is bound to `fig` instead of `plt`, so the matplotlib.pyplot
# alias imported at the top of the file is not overwritten by a Plotly figure.

# Day of month, split by malware indicator
fig = px.histogram(df, x='Day', color='Malware Indicators', title='Number of Malware Attacks by Day')
fig.show()
# Month distribution
fig = px.histogram(df, x='Month', title='Month')
fig.show()
# Month, split by malware indicator
fig = px.histogram(df, x='Month', color='Malware Indicators', title='Number of Malware Attacks by Month')
fig.show()
# Year distribution
fig = px.histogram(df, x='Year', title='Year')
fig.show()
# Year, split by malware indicator
fig = px.histogram(df, x='Year', color='Malware Indicators', title='Number of Malware Attacks by Year')
fig.show()
# Protocol, traffic-type and attack-type Plotly charts.
# Each figure is bound to `fig` instead of `plt`, so the matplotlib.pyplot
# alias imported at the top of the file is not overwritten by a Plotly figure.

# Protocol, split by malware indicator
fig = px.histogram(df, x='Protocol', color='Malware Indicators', title='Number of Malware Attacks by Protocol')
fig.show()
# Traffic type shares
fig = px.pie(df, names='Traffic Type', title='Traffic Distribution')
fig.show()
# Traffic type, split by malware indicator
fig = px.histogram(df, x='Traffic Type', color='Malware Indicators', title='Number of Malware Attacks by Traffic Type')
fig.show()
# Attack type shares
fig = px.pie(df, names='Attack Type', title='Attack Type Distribution')
fig.show()
# Attack type, split by traffic type
fig = px.histogram(df, x='Attack Type', color='Traffic Type', title='Number of Malware Attacks by Attack Type')
fig.show()
# Browser and platform Plotly charts.
# Each figure is bound to `fig` instead of `plt`, so the matplotlib.pyplot
# alias imported at the top of the file is not overwritten by a Plotly figure.

# Browser shares
fig = px.pie(df, names='Browser', title='Browser Distribution')
fig.show()
# Platform (Device/OS) shares
fig = px.pie(df, names='Device/OS', title='Platform Distribution')
fig.show()
# Platform, split by browser
fig = px.histogram(df, x='Device/OS', color='Browser', title='Platform Distribution')
fig.show()
# Platform, split by attack type
fig = px.histogram(df, x='Device/OS', color='Attack Type', title='Number of Malware Attacks by Browser and Devices')
fig.show()
# Browser, split by attack type
fig = px.histogram(df, x='Browser', color='Attack Type', title='Number of Attacks by Browser')
fig.show()
# Log-source and action-taken Plotly charts.
# Each figure is bound to `fig` instead of `plt`, so the matplotlib.pyplot
# alias imported at the top of the file is not overwritten by a Plotly figure.

# Log source distribution
fig = px.histogram(df, x='Log Source', title='Log Source')
fig.show()
# Action taken distribution
fig = px.histogram(df, x='Action Taken', title='Action Taken')
fig.show()
# Action taken, split by attack type (this chart was previously titled
# 'Log Source' although it plots 'Action Taken')
fig = px.histogram(df, x='Action Taken', color='Attack Type', title='Action Taken by Attack Type')
fig.show()
# Log source, split by attack type
fig = px.histogram(df, x='Log Source', color='Attack Type', title='Log Source')
fig.show()
# Compare the Packet Length distributions of Malware, Intrusion and DDoS attacks
import plotly.graph_objs as go

# Packet lengths of the rows that belong to each attack type
attack_type = df['Attack Type']
malware_data = df.loc[attack_type == 'Malware', 'Packet Length']
intrusion_data = df.loc[attack_type == 'Intrusion', 'Packet Length']
ddos_data = df.loc[attack_type == 'DDoS', 'Packet Length']

# One histogram trace per attack type, each drawn at 70% opacity
malware_histogram = go.Histogram(x=malware_data, name='Malware', opacity=0.7)
intrusion_histogram = go.Histogram(x=intrusion_data, name='Intrusion', opacity=0.7)
ddos_histogram = go.Histogram(x=ddos_data, name='DDoS', opacity=0.7)

# Figure title and axis labels
layout = go.Layout(
    title='Packet Length Distribution for Different Attack Types',
    xaxis={'title': 'Packet Length'},
    yaxis={'title': 'Frequency'},
)

# Assemble the three traces into one figure and display it
traces = [malware_histogram, intrusion_histogram, ddos_histogram]
fig = go.Figure(data=traces, layout=layout)
fig.show()